In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv("data/immobilier.csv")

In [3]:
data.shape


Out[3]:
(1460, 81)

In [4]:
data.head()


Out[4]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 12 2008 WD Normal 250000

5 rows × 81 columns

We want to predict the "SalePrice" column, so all the other columns are features for the model to learn from.


In [172]:
features = [col for col in data.columns if col != "SalePrice"]

In [173]:
features


Out[173]:
['Id',
 'MSSubClass',
 'MSZoning',
 'LotFrontage',
 'LotArea',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'MasVnrArea',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinSF1',
 'BsmtFinType2',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Functional',
 'Fireplaces',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SaleType',
 'SaleCondition']

In [174]:
train = data[features]
y = data.SalePrice
#y = data['SalePrice']

In [175]:
train.head()


Out[175]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 0 NaN NaN NaN 0 2 2008 WD Normal
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 0 NaN NaN NaN 0 5 2007 WD Normal
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 0 NaN NaN NaN 0 9 2008 WD Normal
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub ... 0 0 NaN NaN NaN 0 2 2006 WD Abnorml
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub ... 0 0 NaN NaN NaN 0 12 2008 WD Normal

5 rows × 80 columns


In [176]:
y.head()


Out[176]:
0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

In [177]:
sns.distplot(y)


Out[177]:
<matplotlib.axes._subplots.AxesSubplot at 0x111c226d0>

In [131]:
# Regression model
from sklearn.linear_model import Ridge

In [49]:
import sklearn

In [50]:
sklearn.__version__


Out[50]:
'0.18.1'

In [181]:
# Initialize the model
model_ridge = Ridge()

In [182]:
# 1) Train the model
model_ridge.fit(train, y)

# Error ...


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-182-d7cd6d81f105> in <module>()
      1 # 1) Train the model
----> 2 model_ridge.fit(train, y)
      3 
      4 # Error ...

/Users/babou/anaconda/lib/python2.7/site-packages/sklearn/linear_model/ridge.pyc in fit(self, X, y, sample_weight)
    640         self : returns an instance of self.
    641         """
--> 642         return super(Ridge, self).fit(X, y, sample_weight=sample_weight)
    643 
    644 

/Users/babou/anaconda/lib/python2.7/site-packages/sklearn/linear_model/ridge.pyc in fit(self, X, y, sample_weight)
    463     def fit(self, X, y, sample_weight=None):
    464         X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float64,
--> 465                          multi_output=True, y_numeric=True)
    466 
    467         if ((sample_weight is not None) and

/Users/babou/anaconda/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
    519     X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite,
    520                     ensure_2d, allow_nd, ensure_min_samples,
--> 521                     ensure_min_features, warn_on_dtype, estimator)
    522     if multi_output:
    523         y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,

/Users/babou/anaconda/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    380                                       force_all_finite)
    381     else:
--> 382         array = np.array(array, dtype=dtype, order=order, copy=copy)
    383 
    384         if ensure_2d:

ValueError: could not convert string to float: Normal

The model only accepts numeric input, so the string columns must be converted into numbers.


In [183]:
data['SaleCondition'].head()


Out[183]:
0     Normal
1     Normal
2     Normal
3    Abnorml
4     Normal
Name: SaleCondition, dtype: object

In [184]:
pd.get_dummies(data['SaleCondition'], prefix="SaleCondition").head()


Out[184]:
SaleCondition_Abnorml SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal SaleCondition_Partial
0 0 0 0 0 1 0
1 0 0 0 0 1 0
2 0 0 0 0 1 0
3 1 0 0 0 0 0
4 0 0 0 0 1 0

In [185]:
def prepare_data(data):
    
    features = [col for col in data.columns if col != "SalePrice"] # 80 columns
    
    train = data[features]
    y = data.SalePrice
    
    # One-hot encode object (string) features
    train = pd.get_dummies(train)
    
    return train, y

In [186]:
train, y = prepare_data(data.copy())

In [187]:
train.head()


Out[187]:
Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 ... SaleType_ConLw SaleType_New SaleType_Oth SaleType_WD SaleCondition_Abnorml SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal SaleCondition_Partial
0 1 60 65.0 8450 7 5 2003 2003 196.0 706 ... 0 0 0 1 0 0 0 0 1 0
1 2 20 80.0 9600 6 8 1976 1976 0.0 978 ... 0 0 0 1 0 0 0 0 1 0
2 3 60 68.0 11250 7 5 2001 2002 162.0 486 ... 0 0 0 1 0 0 0 0 1 0
3 4 70 60.0 9550 7 5 1915 1970 0.0 216 ... 0 0 0 1 1 0 0 0 0 0
4 5 60 84.0 14260 8 5 2000 2000 350.0 655 ... 0 0 0 1 0 0 0 0 1 0

5 rows × 289 columns


In [188]:
# 2) Train the model
model_ridge.fit(train, y)

# Error ...


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-188-fc61ba646869> in <module>()
      1 # 2) Train the model
----> 2 model_ridge.fit(train, y)
      3 
      4 # Error ...

/Users/babou/anaconda/lib/python2.7/site-packages/sklearn/linear_model/ridge.pyc in fit(self, X, y, sample_weight)
    640         self : returns an instance of self.
    641         """
--> 642         return super(Ridge, self).fit(X, y, sample_weight=sample_weight)
    643 
    644 

/Users/babou/anaconda/lib/python2.7/site-packages/sklearn/linear_model/ridge.pyc in fit(self, X, y, sample_weight)
    463     def fit(self, X, y, sample_weight=None):
    464         X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float64,
--> 465                          multi_output=True, y_numeric=True)
    466 
    467         if ((sample_weight is not None) and

/Users/babou/anaconda/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
    519     X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite,
    520                     ensure_2d, allow_nd, ensure_min_samples,
--> 521                     ensure_min_features, warn_on_dtype, estimator)
    522     if multi_output:
    523         y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,

/Users/babou/anaconda/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    405                              % (array.ndim, estimator_name))
    406         if force_all_finite:
--> 407             _assert_all_finite(array)
    408 
    409     shape_repr = _shape_repr(array.shape)

/Users/babou/anaconda/lib/python2.7/site-packages/sklearn/utils/validation.pyc in _assert_all_finite(X)
     56             and not np.isfinite(X).all()):
     57         raise ValueError("Input contains NaN, infinity"
---> 58                          " or a value too large for %r." % X.dtype)
     59 
     60 

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [189]:
data.BsmtFinType2.value_counts(dropna=False)


Out[189]:
Unf    1256
Rec      54
LwQ      46
NaN      38
BLQ      33
ALQ      19
GLQ      14
Name: BsmtFinType2, dtype: int64

There are missing values that the model cannot handle, so they must be replaced.
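
As a minimal illustration (toy data, not from this dataset), replacing NaN values with the column mean works like this:

toy = pd.DataFrame({"surface": [50.0, np.nan, 70.0]})
toy = toy.fillna(toy.mean())   # the NaN becomes the column mean, 60.0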


In [190]:
pd.isnull(data).sum()


Out[190]:
Id                  0
MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          8
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
                 ... 
BedroomAbvGr        0
KitchenAbvGr        0
KitchenQual         0
TotRmsAbvGrd        0
Functional          0
Fireplaces          0
FireplaceQu       690
GarageType         81
GarageYrBlt        81
GarageFinish       81
GarageCars          0
GarageArea          0
GarageQual         81
GarageCond         81
PavedDrive          0
WoodDeckSF          0
OpenPorchSF         0
EnclosedPorch       0
3SsnPorch           0
ScreenPorch         0
PoolArea            0
PoolQC           1453
Fence            1179
MiscFeature      1406
MiscVal             0
MoSold              0
YrSold              0
SaleType            0
SaleCondition       0
SalePrice           0
dtype: int64

In [191]:
def prepare_data(data):
    
    features = [col for col in data.columns if col != "SalePrice"]
    
    train = data[features]
    y = data.SalePrice
    
    # One-hot encode object (string) features
    train = pd.get_dummies(train) 
    
    # Replace NaN values with the column mean
    train = train.fillna(train.mean())

    return train, y

In [192]:
train, y = prepare_data(data.copy())

In [193]:
# 3) Train the model
model_ridge.fit(train, y)

# yeah !!!


Out[193]:
Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

Understanding our model's performance score:
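
The metric used below is the mean absolute error (MAE): the average absolute gap between the true values and the predictions,

$$\mathrm{MAE} = \frac{1}{n} \sum_{i=1}^{n} \left| y_i - \hat{y}_i \right|$$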


In [5]:
from sklearn.metrics import mean_absolute_error

In [195]:
vrai = np.array([1000, 2000, 1500])
prediction = np.array([900, 2200, 1300])   # classic
#prediction = np.array([990, 2005, 1500])   # Best
#prediction = np.array([9000, 22000, 13000])    # Bad

In [196]:
mean_absolute_error(vrai, prediction)


Out[196]:
166.66666666666666

In [197]:
1000 - 900


Out[197]:
100

In [198]:
2000 - 2200


Out[198]:
-200

In [199]:
1500 - 1300


Out[199]:
200

In [200]:
(100 + 200 + 200) / 3.0


Out[200]:
166.66666666666666

Building our cross-validation strategy


In [6]:
from sklearn.model_selection import cross_val_score

In [7]:
def cross_validation(model, train, y, cv=5):
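    # sklearn scorers follow a "greater is better" convention, so the MAE comes back negated; the leading minus sign flips it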
    mae = -cross_val_score(model, train, y, scoring="neg_mean_absolute_error", cv = cv)
    return mae
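
Roughly what cross_val_score is doing under the hood (a sketch, assuming consecutive unshuffled folds, sklearn's default for regression):

from sklearn.model_selection import KFold

def manual_cross_validation(model, train, y, cv=5):
    # split the rows into cv folds; each fold is held out once for evaluation
    maes = []
    for train_idx, test_idx in KFold(n_splits=cv).split(train):
        model.fit(train.iloc[train_idx], y.iloc[train_idx])
        preds = model.predict(train.iloc[test_idx])
        maes.append(mean_absolute_error(y.iloc[test_idx], preds))
    return np.array(maes)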

In [203]:
score = cross_validation(model_ridge, train, y)
print score


[ 17781.7821806   19834.62785741  19109.55973426  17004.7525164
  19526.64170021]

In [204]:
score.mean(), score.std()


Out[204]:
(18651.472797776139, 1081.0799784470737)

In [205]:
data.SalePrice.describe()


Out[205]:
count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

Here is our first score! An average error of about 18,650, against a mean sale price of about 181,000.


In [206]:
preds = pd.DataFrame({"preds":model_ridge.predict(train), "true":y})
preds["residuals"] = np.abs(preds["true"] - preds["preds"])
preds.plot(x = "preds", y = "residuals",kind = "scatter")


Out[206]:
<matplotlib.axes._subplots.AxesSubplot at 0x111544810>

In [207]:
preds[preds.residuals >150000]


Out[207]:
preds true residuals
523 393778.481732 184750 209028.481732
898 456930.415789 611657 154726.584211
1298 341589.949296 160000 181589.949296

In [208]:
data.shape


Out[208]:
(1460, 81)

In [209]:
def prepare_data_outlier(data):
    
    features = [col for col in data.columns if col != "SalePrice"]
    
    # Drop the rows that are too extreme (outliers)
    data = data.drop(data.index[[523, 898, 1298]])
    
    train = data[features]
    y = data.SalePrice
    
    # One-hot encode object (string) features
    train = pd.get_dummies(train)
    
    # Replace NaN values with the column mean
    train = train.fillna(train.mean())
    
    return train, y

In [210]:
train, y = prepare_data_outlier(data.copy())
print train.shape, y.shape


(1457, 288) (1457,)

In [211]:
score = cross_validation(model_ridge, train, y)
print score.mean()


16905.6301599

In [213]:
train, y = prepare_data(data.copy())
print train.shape, y.shape


(1460, 289) (1460,)

In [214]:
score = cross_validation(model_ridge, train, y)
print score.mean()


18651.4727978

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(train, y, random_state = 3)



In [216]:
print"X_train : " + str(X_train.shape)
print"X_validation : " + str(X_validation.shape)
print"y_train : " + str(y_train.shape)
print"y_validation : " + str(y_validation.shape)


X_train : (1095, 289)
X_validation : (365, 289)
y_train : (1095,)
y_validation : (365,)

In [217]:
model_ridge.fit(X_train, y_train)


Out[217]:
Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [218]:
mes_predictions = model_ridge.predict(X_validation)

In [219]:
# My predictions
mes_predictions[0:5]


Out[219]:
array([ 123579.7980259 ,  130806.08380828,  189586.80408004,
        214612.50594791,  275183.97258377])

In [220]:
# The true values
y_validation[0:5]


Out[220]:
140     115000
950     129000
248     180000
1360    189000
568     316600
Name: SalePrice, dtype: int64

In [221]:
mean_absolute_error(y_validation, mes_predictions)


Out[221]:
20154.930350137409

In [222]:
plt.scatter(mes_predictions, y_validation)
plt.plot([min(mes_predictions), max(mes_predictions)],
         [min(mes_predictions), max(mes_predictions)], c="red")
plt.xlabel('My predictions')
plt.ylabel('True values')


Out[222]:
<matplotlib.text.Text at 0x11164c050>

One prediction looks completely lost: we predict around 900,000 when the true value is below 200,000...


In [223]:
analyse = X_validation.copy()

In [224]:
analyse.head()


Out[224]:
Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 ... SaleType_ConLw SaleType_New SaleType_Oth SaleType_WD SaleCondition_Abnorml SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal SaleCondition_Partial
140 141 20 70.0 10500 4 5 1971 1971 0.0 432 ... 0 0 0 0 0 0 0 0 1 0
950 951 20 60.0 7200 5 8 1950 2002 0.0 398 ... 0 0 0 1 0 0 0 0 1 0
248 249 60 72.0 11317 7 5 2003 2003 101.0 0 ... 0 0 0 1 0 0 0 0 1 0
1360 1361 70 51.0 9842 5 6 1921 1998 0.0 0 ... 0 0 0 1 0 0 0 0 1 0
568 569 50 79.0 12327 8 8 1983 2009 0.0 1441 ... 0 0 0 1 0 0 0 0 1 0

5 rows × 289 columns


In [225]:
analyse['prix'] = y_validation

In [226]:
analyse.head()


Out[226]:
Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 ... SaleType_New SaleType_Oth SaleType_WD SaleCondition_Abnorml SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal SaleCondition_Partial prix
140 141 20 70.0 10500 4 5 1971 1971 0.0 432 ... 0 0 0 0 0 0 0 1 0 115000
950 951 20 60.0 7200 5 8 1950 2002 0.0 398 ... 0 0 1 0 0 0 0 1 0 129000
248 249 60 72.0 11317 7 5 2003 2003 101.0 0 ... 0 0 1 0 0 0 0 1 0 180000
1360 1361 70 51.0 9842 5 6 1921 1998 0.0 0 ... 0 0 1 0 0 0 0 1 0 189000
568 569 50 79.0 12327 8 8 1983 2009 0.0 1441 ... 0 0 1 0 0 0 0 1 0 316600

5 rows × 290 columns


In [227]:
analyse['prediction'] = mes_predictions

In [228]:
analyse.head()


Out[228]:
Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 ... SaleType_Oth SaleType_WD SaleCondition_Abnorml SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal SaleCondition_Partial prix prediction
140 141 20 70.0 10500 4 5 1971 1971 0.0 432 ... 0 0 0 0 0 0 1 0 115000 123579.798026
950 951 20 60.0 7200 5 8 1950 2002 0.0 398 ... 0 1 0 0 0 0 1 0 129000 130806.083808
248 249 60 72.0 11317 7 5 2003 2003 101.0 0 ... 0 1 0 0 0 0 1 0 180000 189586.804080
1360 1361 70 51.0 9842 5 6 1921 1998 0.0 0 ... 0 1 0 0 0 0 1 0 189000 214612.505948
568 569 50 79.0 12327 8 8 1983 2009 0.0 1441 ... 0 1 0 0 0 0 1 0 316600 275183.972584

5 rows × 291 columns


In [229]:
analyse[analyse.prediction >= 800000]


Out[229]:
Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 ... SaleType_Oth SaleType_WD SaleCondition_Abnorml SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal SaleCondition_Partial prix prediction
1298 1299 60 313.0 63887 10 5 2008 2008 796.0 5644 ... 0 0 0 0 0 0 0 1 160000 903754.455156

1 rows × 291 columns


In [230]:
sns.countplot(data.SaleCondition)


Out[230]:
<matplotlib.axes._subplots.AxesSubplot at 0x111645a50>

Transforming our price to improve the score:
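
A quick sanity check (a sketch): np.log1p and np.expm1 are exact inverses, so we can train on log-prices and map the predictions back afterwards:

prix = np.array([34900.0, 163000.0, 755000.0])
print(np.expm1(np.log1p(prix)))   # recovers the original prices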


In [231]:
sns.distplot(data.SalePrice)


Out[231]:
<matplotlib.axes._subplots.AxesSubplot at 0x114a30350>

In [232]:
data.SalePrice.describe()


Out[232]:
count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [233]:
sns.distplot(np.log1p(data.SalePrice))


Out[233]:
<matplotlib.axes._subplots.AxesSubplot at 0x103a74d10>

In [234]:
np.log1p(data.SalePrice).describe()


Out[234]:
count    1460.000000
mean       12.024057
std         0.399449
min        10.460271
25%        11.775105
50%        12.001512
75%        12.273736
max        13.534474
Name: SalePrice, dtype: float64

In [235]:
def prepare_data_log(data):
    
    features = [col for col in data.columns if col != "SalePrice"]
    
    train = data[features]
    y = data.SalePrice
    # Log-transform the target
    y = np.log1p(y)
    
    # One-hot encode object (string) features
    train = pd.get_dummies(train) 
    
    # Replace NaN values with the column mean
    train = train.fillna(train.mean())

    return train, y

In [236]:
train, y = prepare_data_log(data)

In [237]:
score = cross_validation(model_ridge, train, y)
print score.mean()


0.0902204189461

In [238]:
X_train, X_validation, y_train, y_validation = train_test_split(train, y, random_state = 3)

In [239]:
model_ridge.fit(X_train, y_train)


Out[239]:
Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [240]:
mes_predictions = model_ridge.predict(X_validation)

In [241]:
mes_predictions[0:5]


Out[241]:
array([ 11.56322985,  11.84090375,  12.16291362,  12.18622392,  12.50288167])

In [242]:
# The true values
y_validation[0:5]


Out[242]:
140     11.652696
950     11.767575
248     12.100718
1360    12.149508
568     12.665398
Name: SalePrice, dtype: float64

In [243]:
mean_absolute_error(y_validation, mes_predictions)


Out[243]:
0.090238412085009004

In [244]:
mes_predictions_exp = np.expm1(mes_predictions)
y_validation_exp = np.expm1(y_validation)

In [245]:
# Convert the values back to the original scale (exp)
mean_absolute_error(y_validation_exp, mes_predictions_exp)


Out[245]:
20652.779026301429

In [246]:
plt.scatter(mes_predictions_exp, y_validation_exp)
plt.plot([min(mes_predictions_exp), max(mes_predictions_exp)],
         [min(mes_predictions_exp), max(mes_predictions_exp)], c="red")
plt.xlabel('My predictions')
plt.ylabel('True values')


Out[246]:
<matplotlib.text.Text at 0x10fff0510>

In [247]:
def prepare_data_outlier_log(data):
    
    features = [col for col in data.columns if col != "SalePrice"]
    
    # Drop the rows that are too extreme (outliers)
    data = data.drop(data.index[[523, 898, 1298]])
    
    train = data[features]
    y = data.SalePrice
    # Log-transform the target
    y = np.log1p(y)
    
    # One-hot encode object (string) features
    train = pd.get_dummies(train) 
    
    # Replace NaN values with the column mean
    train = train.fillna(train.mean())

    return train, y

In [248]:
train, y = prepare_data_outlier_log(data.copy())

In [249]:
score = cross_validation(model_ridge, train, y)
print score.mean()


0.0828581382524

In [250]:
X_train, X_validation, y_train, y_validation = train_test_split(train, y, random_state = 3)

In [251]:
model_ridge.fit(X_train, y_train)


Out[251]:
Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [252]:
mes_predictions = model_ridge.predict(X_validation)

In [253]:
mes_predictions[0:5]


Out[253]:
array([ 11.48383154,  11.69299838,  12.16844758,  11.7510565 ,  11.87693117])

In [254]:
# The true values
y_validation[0:5]


Out[254]:
140    11.652696
951    11.694422
248    12.100718
738    12.095147
569    11.820123
Name: SalePrice, dtype: float64

In [255]:
mes_predictions_exp = np.expm1(mes_predictions)
y_validation_exp = np.expm1(y_validation)

In [256]:
# Convert the values back to the original scale (exp)
mean_absolute_error(y_validation_exp, mes_predictions_exp)


Out[256]:
13946.253895956104

In [257]:
plt.scatter(mes_predictions_exp, y_validation_exp)
plt.plot([min(mes_predictions_exp), max(mes_predictions_exp)],
         [min(mes_predictions_exp), max(mes_predictions_exp)], c="red")
plt.xlabel('My predictions')
plt.ylabel('True values')


Out[257]:
<matplotlib.text.Text at 0x1101ab990>

Hyperparameters of a model
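
As a reminder, Ridge regression minimizes the squared error plus an L2 penalty on the coefficients, weighted by the hyperparameter alpha (the larger alpha is, the more the coefficients are shrunk):

$$\min_{w} \; \lVert X w - y \rVert_2^2 + \alpha \, \lVert w \rVert_2^2$$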


In [130]:
model_ridge = Ridge()



In [259]:
model_ridge


Out[259]:
Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [260]:
#alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
alphas = [10, 10.5, 11, 11.5, 12, 12.5, 13, 13.5, 14, 14.5, 15, 15.5, 16]
cv_ridge = [cross_validation(Ridge(alpha = alpha ,random_state=42), train, y).mean() 
            for alpha in alphas]

In [261]:
cv_ridge = pd.Series(cv_ridge, index = alphas)
cv_ridge.plot()
plt.xlabel("alpha")
plt.ylabel("mean absolute error")


Out[261]:
<matplotlib.text.Text at 0x114180750>

In [262]:
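# with this pandas version, argmin returns the index label of the minimum, i.e. the best alpha (newer pandas: idxmin)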
cv_ridge.argmin()


Out[262]:
13.5

In [263]:
cv_ridge


Out[263]:
10.0    0.079780
10.5    0.079774
11.0    0.079770
11.5    0.079768
12.0    0.079765
12.5    0.079760
13.0    0.079758
13.5    0.079757
14.0    0.079760
14.5    0.079762
15.0    0.079767
15.5    0.079771
16.0    0.079776
dtype: float64

In [265]:
score = cross_validation(Ridge(alpha=13.5, random_state=42), train, y)
print score.mean()


0.079757180837

In [266]:
X_train, X_validation, y_train, y_validation = train_test_split(train, y, random_state = 3)

In [267]:
model_ridge = Ridge(alpha=13.5, random_state=42)
model_ridge.fit(X_train, y_train)


Out[267]:
Ridge(alpha=13.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=42, solver='auto', tol=0.001)

In [268]:
mes_predictions_exp = np.expm1(model_ridge.predict(X_validation))

In [269]:
y_validation_exp = np.expm1(y_validation)

In [270]:
mean_absolute_error(y_validation_exp, mes_predictions_exp)


Out[270]:
13713.683434371746

In [271]:
plt.scatter(mes_predictions_exp, y_validation_exp)
plt.plot([min(mes_predictions_exp), max(mes_predictions_exp)],
         [min(mes_predictions_exp), max(mes_predictions_exp)], c="red")
plt.xlabel('My predictions')
plt.ylabel('True values')


Out[271]:
<matplotlib.text.Text at 0x114ddbc50>

Going further:


In [104]:
data.plot(kind='scatter', x="1stFlrSF", y='SalePrice')


Out[104]:
<matplotlib.axes._subplots.AxesSubplot at 0x1150fa490>

In [276]:
data.plot(kind='scatter', x="2ndFlrSF", y='SalePrice')


Out[276]:
<matplotlib.axes._subplots.AxesSubplot at 0x115796fd0>

In [277]:
data['1stFlr_2ndFlr_Sf'] = data['1stFlrSF'] + data['2ndFlrSF']

In [278]:
data.plot(kind='scatter', x="1stFlr_2ndFlr_Sf", y='SalePrice')


Out[278]:
<matplotlib.axes._subplots.AxesSubplot at 0x111889910>

In [280]:
sns.distplot(np.log1p(data['1stFlr_2ndFlr_Sf']))


Out[280]:
<matplotlib.axes._subplots.AxesSubplot at 0x111889bd0>

In [108]:
data[(data['1stFlr_2ndFlr_Sf'] > 4000) & (data.SalePrice <= 700000)]


Out[108]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice 1stFlr_2ndFlr_Sf
523 524 60 RL 130.0 40094 Pave NaN IR1 Bnk AllPub ... NaN NaN NaN 0 10 2007 New Partial 184750 4676
1298 1299 60 RL 313.0 63887 Pave NaN IR3 Bnk AllPub ... Gd NaN NaN 0 1 2008 New Partial 160000 5642

2 rows × 82 columns


In [320]:
def prepare_data_outlier_log_plus(data):
    
    # Drop the rows that are too extreme (outliers)
    data = data.drop(data.index[[523, 898, 1298]])
    
    # Add a new feature
    data['1stFlr_2ndFlr_Sf'] = np.log1p(data['1stFlrSF'] + data['2ndFlrSF'])
    
    features = [col for col in data.columns if col != "SalePrice"]
    
    train = data[features]
    
    y = data.SalePrice
    # Log-transform the target
    y = np.log1p(y)
    
    # One-hot encode object (string) features
    train = pd.get_dummies(train) 
    
    # Replace NaN values with the column mean
    train = train.fillna(train.mean())
    
    print train.shape

    return train, y

In [321]:
train, y = prepare_data_outlier_log_plus(data.copy())


(1457, 289)

In [322]:
score = cross_validation(Ridge(alpha=13.5, random_state=42), train, y)
print score.mean()


0.0792999760212

In [323]:
X_train, X_validation, y_train, y_validation = train_test_split(train, y, random_state = 3)

In [324]:
model_ridge = Ridge(alpha=13.5, random_state=42)
model_ridge.fit(X_train, y_train)


Out[324]:
Ridge(alpha=13.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=42, solver='auto', tol=0.001)

In [325]:
mes_predictions_exp = np.expm1(model_ridge.predict(X_validation))

In [326]:
y_validation_exp = np.expm1(y_validation)

In [327]:
mean_absolute_error(y_validation_exp, mes_predictions_exp)


Out[327]:
13595.418391239062

In [328]:
plt.scatter(mes_predictions_exp, y_validation_exp)
plt.plot([min(mes_predictions_exp), max(mes_predictions_exp)],
         [min(mes_predictions_exp), max(mes_predictions_exp)], c="red")
plt.xlabel('My predictions')
plt.ylabel('True values')


Out[328]:
<matplotlib.text.Text at 0x11477cdd0>

In [329]:
model_ridge.coef_[0:10]


Out[329]:
array([ -1.17526078e-05,  -1.62168730e-04,   4.23445411e-04,
         2.34264991e-06,   4.67261807e-02,   4.41918434e-02,
         2.01984717e-03,   5.90569796e-04,   4.30867731e-06,
         5.59029977e-05])

In [330]:
coef = pd.Series(model_ridge.coef_, index = X_train.columns)
# Take the nb_important most negative and most positive coefficients
nb_important = 25
imp_coef = pd.concat([coef.sort_values().head(nb_important),
                     coef.sort_values().tail(nb_important)])
imp_coef.plot(kind = "barh", figsize=(10, 8))
plt.title("Coefficients in Model")


Out[330]:
<matplotlib.text.Text at 0x116600d90>

Even further:


In [61]:
# To display images (no need to type this import)
from IPython.display import Image

In [45]:
data[['YearBuilt', 'GarageYrBlt']].head()


Out[45]:
YearBuilt GarageYrBlt
0 2003 2003.0
1 1976 1976.0
2 2001 2001.0
3 1915 1998.0
4 2000 2000.0

In [46]:
df = data.copy() # Work on df so the DataFrame `data` stays unchanged

In [47]:
df['build_home_garage_same_year'] = 0
df.loc[data['YearBuilt'] == data['GarageYrBlt'], 'build_home_garage_same_year'] = 1

In [48]:
df.build_home_garage_same_year.value_counts()


Out[48]:
1    1089
0     371
Name: build_home_garage_same_year, dtype: int64

In [297]:
def prepare_data_outlier_log_plus_2(data):
    
    # Drop the rows that are too extreme (outliers)
    data = data.drop(data.index[[523, 898, 1298]])
    data['1stFlr_2ndFlr_Sf'] = np.log1p(data['1stFlrSF'] + data['2ndFlrSF'])
    
    # Flag houses whose garage was built the same year as the house
    data['build_home_garage_same_year'] = "N"
    data.loc[data['YearBuilt'] == data['GarageYrBlt'], 'build_home_garage_same_year'] = "Y"
    
    features = [col for col in data.columns if col != "SalePrice"]
    
    train = data[features]
    
    y = data.SalePrice
    # Log-transform the target
    y = np.log1p(y)
    
    # One-hot encode object (string) features
    train = pd.get_dummies(train) 
    
    # Replace NaN values with the column mean
    train = train.fillna(train.mean())
    
    print train.shape

    return train, y

In [298]:
train.head()


Out[298]:
Id MSSubClass LotFrontage LotArea OverallQual OverallCond YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 ... SaleType_ConLw SaleType_New SaleType_Oth SaleType_WD SaleCondition_Abnorml SaleCondition_AdjLand SaleCondition_Alloca SaleCondition_Family SaleCondition_Normal SaleCondition_Partial
0 1 60 65.0 8450 7 5 2003 2003 196.0 706 ... 0 0 0 1 0 0 0 0 1 0
1 2 20 80.0 9600 6 8 1976 1976 0.0 978 ... 0 0 0 1 0 0 0 0 1 0
2 3 60 68.0 11250 7 5 2001 2002 162.0 486 ... 0 0 0 1 0 0 0 0 1 0
3 4 70 60.0 9550 7 5 1915 1970 0.0 216 ... 0 0 0 1 1 0 0 0 0 0
4 5 60 84.0 14260 8 5 2000 2000 350.0 655 ... 0 0 0 1 0 0 0 0 1 0

5 rows × 290 columns


In [50]:
data.shape


Out[50]:
(1460, 81)

In [317]:
train, y = prepare_data_outlier_log_plus_2(data.copy())


(1457, 291)

In [318]:
score = cross_validation(Ridge(alpha=13.5, random_state=42), train, y)
print score.mean()


0.0794684185071

In [35]:
# Best was 0.0794542370234, so this feature is not a positive addition

Always test a feature addition to find out whether its impact is positive or negative, as sketched below.
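
A hypothetical helper (not in the original notebook) to make these before/after comparisons systematic, reusing the cross_validation function and the prepare_* functions defined above:

def compare_preparations(model, data, prepare_before, prepare_after):
    # cross-validate the same model on two different feature preparations
    score_before = cross_validation(model, *prepare_before(data.copy())).mean()
    score_after = cross_validation(model, *prepare_after(data.copy())).mean()
    print "before : " + str(score_before) + " / after : " + str(score_after)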

Let's look at the "MasVnrArea" & "MasVnrType" columns:


In [309]:
df.MasVnrType.value_counts()


Out[309]:
None       864
BrkFace    445
Stone      128
BrkCmn      15
Name: MasVnrType, dtype: int64

In [310]:
df.MasVnrType.head()


Out[310]:
0    BrkFace
1       None
2    BrkFace
3       None
4    BrkFace
Name: MasVnrType, dtype: object

In [357]:
#df.MasVnrArea.value_counts()

In [312]:
df.shape


Out[312]:
(1460, 82)

In [313]:
df[df.MasVnrType == "None"].MasVnrArea.value_counts()


Out[313]:
0.0      859
1.0        2
312.0      1
344.0      1
288.0      1
Name: MasVnrArea, dtype: int64

In [331]:
def prepare_data_outlier_log_plus_3(data):
    
    # Drop the rows that are too extreme (outliers)
    data = data.drop(data.index[[523, 898, 1298]])
    data['1stFlr_2ndFlr_Sf'] = np.log1p(data['1stFlrSF'] + data['2ndFlrSF'])
    
    # A "None" veneer type means no veneer, so force the area to 0
    data.loc[data.MasVnrType == 'None', 'MasVnrArea'] = 0
    
    features = [col for col in data.columns if col != "SalePrice"]
    
    train = data[features]
    
    y = data.SalePrice
    # Log-transform the target
    y = np.log1p(y)
    
    # One-hot encode object (string) features
    train = pd.get_dummies(train) 
    
    # Replace NaN values with the column mean
    train = train.fillna(train.mean())
    
    print train.shape

    return train, y

In [332]:
train, y = prepare_data_outlier_log_plus_3(data.copy())


(1457, 289)

In [333]:
score = cross_validation(Ridge(alpha=13.5, random_state=42), train, y)
print score.mean()


0.0793025166395

In [64]:
Image(url="http://i.giphy.com/GPq3wxmLbwUGA.gif")


Out[64]:

BsmtFinType1 & BsmtFinSF1 / BsmtFinType2 & BsmtFinSF2


In [341]:
df.BsmtFinType2.value_counts(dropna=False)


Out[341]:
Unf    1256
Rec      54
LwQ      46
NaN      38
BLQ      33
ALQ      19
GLQ      14
Name: BsmtFinType2, dtype: int64

In [343]:
df.BsmtFinSF2.describe()


Out[343]:
count    1460.000000
mean       46.549315
std       161.319273
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max      1474.000000
Name: BsmtFinSF2, dtype: float64

In [344]:
df[pd.isnull(df.BsmtFinType2)].BsmtFinSF2.value_counts()


Out[344]:
0      37
479     1
Name: BsmtFinSF2, dtype: int64

In [354]:
def prepare_data_outlier_log_plus_4(data):
    
    # Drop the rows that are too extreme (outliers)
    data = data.drop(data.index[[523, 898, 1298]])
    data['1stFlr_2ndFlr_Sf'] = np.log1p(data['1stFlrSF'] + data['2ndFlrSF'])
    
    data.loc[data.MasVnrType == 'None', 'MasVnrArea'] = 0
    
    # When BsmtFinType2 is missing, zero out BsmtFinSF2 (all but one such row are already 0)
    data.loc[pd.isnull(data.BsmtFinType2), 'BsmtFinSF2'] = 0
    
    features = [col for col in data.columns if col != "SalePrice"]
    
    train = data[features]
    
    y = data.SalePrice
    # Log-transform the target
    y = np.log1p(y)
    
    # One-hot encode object (string) features
    train = pd.get_dummies(train) 
    
    # Replace NaN values with the column mean
    train = train.fillna(train.mean())
    
    print train.shape

    return train, y

In [355]:
train, y = prepare_data_outlier_log_plus_4(data.copy())


(1457, 289)

In [356]:
score = cross_validation(Ridge(alpha=13.5, random_state=42), train, y)
print score.mean()


0.0792739925963

In [230]:
X_train_ridge, X_validation_ridge, y_train_ridge, y_validation_ridge = train_test_split(train, y, random_state = 3)

In [231]:
model_ridge = Ridge(alpha=13.5, random_state=42)
model_ridge.fit(X_train_ridge, y_train_ridge)


Out[231]:
Ridge(alpha=13.5, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=42, solver='auto', tol=0.001)

In [137]:
coef = pd.Series(np.abs(model_ridge.coef_), index = X_train.columns)
# Take the nb_important features with the smallest absolute coefficients
nb_important = 15
#imp_coef = pd.concat([coef.sort_values().head(nb_important),
           #          coef.sort_values().tail(nb_important)])
    
imp_coef = coef.sort_values().head(nb_important)
imp_coef.plot(kind = "barh", figsize=(10, 8))
plt.title("Coefficients in Model")


Out[137]:
<matplotlib.text.Text at 0x117adcb90>

In [367]:
coef.sort_values().head(10)


Out[367]:
GarageCond_Ex        0.000000
Condition2_RRAe      0.000000
Exterior1st_Stone    0.000000
MiscFeature_TenC     0.000000
MiscVal              0.000001
BsmtUnfSF            0.000002
LotArea              0.000002
MasVnrArea           0.000003
GarageYrBlt          0.000007
Id                   0.000012
dtype: float64

In [368]:
features_to_delete = ["GarageCond_Ex",
"Condition2_RRAe",
"Exterior1st_Stone",
"MiscFeature_TenC",
"MiscVal",
"BsmtUnfSF",
"LotArea",
"MasVnrArea",
"GarageYrBlt",
"Id"]

In [369]:
def prepare_data_outlier_log_plus_4_bis(data):
    
    # Drop the rows that are too extreme (outliers)
    data = data.drop(data.index[[523, 898, 1298]])
    data['1stFlr_2ndFlr_Sf'] = np.log1p(data['1stFlrSF'] + data['2ndFlrSF'])
    
    data.loc[data.MasVnrType == 'None', 'MasVnrArea'] = 0
    
    data.loc[pd.isnull(data.BsmtFinType2), 'BsmtFinSF2'] = 0
    
    features = [col for col in data.columns if col != "SalePrice"]
    
    train = data[features]
    
    y = data.SalePrice
    # Log-transform the target
    y = np.log1p(y)
    
    # One-hot encode object (string) features
    train = pd.get_dummies(train) 
    
    # Replace NaN values with the column mean
    train = train.fillna(train.mean())
    
    # Drop the features with near-zero ridge coefficients
    train = train.drop(features_to_delete, axis=1)
    
    print train.shape

    return train, y

In [370]:
train, y = prepare_data_outlier_log_plus_4_bis(data.copy())


(1457, 279)

In [371]:
score = cross_validation(Ridge(alpha=13.5, random_state=42), train, y)
print score.mean()


0.0800835352745

In [375]:
#pd.isnull(df).sum()

In [376]:
df.shape


Out[376]:
(1460, 82)

In [122]:
column_detail = pd.DataFrame(pd.isnull(df).sum(), columns=['nbr_null'])
column_detail.sort_values('nbr_null', ascending=0, inplace=True)
column_detail.head(10)


Out[122]:
nbr_null
PoolQC 1453
MiscFeature 1406
Alley 1369
Fence 1179
FireplaceQu 690
LotFrontage 259
GarageType 81
GarageYrBlt 81
GarageCond 81
GarageQual 81

In [227]:
def prepare_data_outlier_log_plus_5(data):
    
    # Drop the rows that are too extreme (outliers)
    data = data.drop(data.index[[523, 898, 1298]])
    data['1stFlr_2ndFlr_Sf'] = np.log1p(data['1stFlrSF'] + data['2ndFlrSF'])
    
    data.loc[data.MasVnrType == 'None', 'MasVnrArea'] = 0
    
    data.loc[pd.isnull(data.BsmtFinType2), 'BsmtFinSF2'] = 0
    
    # Drop the features with too many null values
    data = data.drop('PoolQC', axis=1)
    data = data.drop('MiscFeature', axis=1)
    data = data.drop('Alley', axis=1)
    data = data.drop('Fence', axis=1)
    
    features = [col for col in data.columns if col != "SalePrice"]
    
    train = data[features]
    
    y = data.SalePrice
    # Log-transform the target
    y = np.log1p(y)
    
    # One-hot encode object (string) features
    train = pd.get_dummies(train) 
    
    # Replace NaN values with the column mean
    train = train.fillna(train.mean())
    
    print train.shape

    return train, y

In [228]:
train, y = prepare_data_outlier_log_plus_5(data.copy())


(1457, 276)

In [134]:
score = cross_validation(Ridge(alpha=13.5, random_state=42), train, y)
print score.mean()


0.0787093812383

In [380]:
Image(url="http://i.giphy.com/LZfZXcFNOOzw4.gif")


Out[380]:

Let's talk about trees now:


In [73]:
from sklearn.tree import DecisionTreeRegressor

In [96]:
np.random.seed(42)

In [74]:
dt = DecisionTreeRegressor(random_state=0)

In [75]:
def dt_prepare_data(data):
    
    features = [col for col in data.columns if col != "SalePrice"]
    
    train = data[features]
    y = data.SalePrice
    
    # Replace NaN values with the mean of the (numeric) columns
    train = train.fillna(train.mean())
    
    print train.shape

    return train, y

In [76]:
train, y = dt_prepare_data(data.copy())


(1460, 80)

In [77]:
train.head()


Out[77]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 0 NaN NaN NaN 0 2 2008 WD Normal
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 0 NaN NaN NaN 0 5 2007 WD Normal
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 0 NaN NaN NaN 0 9 2008 WD Normal
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub ... 0 0 NaN NaN NaN 0 2 2006 WD Abnorml
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub ... 0 0 NaN NaN NaN 0 12 2008 WD Normal

5 rows × 80 columns


In [78]:
# 1) Train the model
dt.fit(train, y)

#Error...


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-78-b3eaf1946320> in <module>()
      1 # 1) Train the model
----> 2 dt.fit(train, y)
      3 
      4 #Error...

/Users/babou/anaconda/lib/python2.7/site-packages/sklearn/tree/tree.pyc in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
   1027             sample_weight=sample_weight,
   1028             check_input=check_input,
-> 1029             X_idx_sorted=X_idx_sorted)
   1030         return self
   1031 

/Users/babou/anaconda/lib/python2.7/site-packages/sklearn/tree/tree.pyc in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
    120         random_state = check_random_state(self.random_state)
    121         if check_input:
--> 122             X = check_array(X, dtype=DTYPE, accept_sparse="csc")
    123             y = check_array(y, ensure_2d=False, dtype=None)
    124             if issparse(X):

/Users/babou/anaconda/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    380                                       force_all_finite)
    381     else:
--> 382         array = np.array(array, dtype=dtype, order=order, copy=copy)
    383 
    384         if ensure_2d:

ValueError: could not convert string to float: Normal

In [97]:
from sklearn.preprocessing import LabelEncoder

We have a problem with our string data...


In [98]:
categoricals = [x for x in data.columns if data[x].dtype == 'object']

In [99]:
categoricals


Out[99]:
['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [100]:
data.SaleCondition.head()


Out[100]:
0     Normal
1     Normal
2     Normal
3    Abnorml
4     Normal
Name: SaleCondition, dtype: object

In [101]:
lbl = LabelEncoder() # Initialize the encoder
lbl.fit(data['SaleCondition'].values)
test = lbl.transform(data['SaleCondition'].values)

In [102]:
test[0:5]


Out[102]:
array([4, 4, 4, 0, 4])

Each string value is converted into a numeric value. Note that this imposes an arbitrary (alphabetical) ordering on the categories, which tree-based models handle fine but which would mislead a linear model.
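
For reference, LabelEncoder assigns the codes in alphabetical order of the categories; the mapping can be inspected via lbl.classes_ (output inferred from the dummy columns seen earlier):

print(lbl.classes_)
# ['Abnorml' 'AdjLand' 'Alloca' 'Family' 'Normal' 'Partial']
# hence 'Normal' -> 4 and 'Abnorml' -> 0, matching test[0:5] above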


In [103]:
def dt_prepare_data_plus(data):
    
    features = [col for col in data.columns if col != "SalePrice"]
    
    train = data[features]
    y = data.SalePrice
    
    # Encode string (object) columns as integers
    categoricals = [x for x in train.columns if train[x].dtype == 'object']
    for col in categoricals:
        lbl = LabelEncoder()
        lbl.fit(train[col].values)
        train[col] = lbl.transform(train[col].values)
    
    # Replace NaN values with the column mean
    train = train.fillna(train.mean())
    
    print train.shape

    return train, y

In [104]:
train, y = dt_prepare_data_plus(data.copy())


/Users/babou/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
(1460, 80)

In [105]:
train.head()


Out[105]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition
0 1 60 3 65.0 8450 1 0 3 3 0 ... 0 0 0 0 0 0 2 2008 8 4
1 2 20 3 80.0 9600 1 0 3 3 0 ... 0 0 0 0 0 0 5 2007 8 4
2 3 60 3 68.0 11250 1 0 0 3 0 ... 0 0 0 0 0 0 9 2008 8 4
3 4 70 3 60.0 9550 1 0 0 3 0 ... 0 0 0 0 0 0 2 2006 8 0
4 5 60 3 84.0 14260 1 0 0 3 0 ... 0 0 0 0 0 0 12 2008 8 4

5 rows × 80 columns


In [106]:
data.head()


Out[106]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 12 2008 WD Normal 250000

5 rows × 81 columns


In [108]:
score = cross_validation(dt, train, y)
print score.mean()


27604.8239726

In [109]:
def dt_prepare_data_plus_log(data):
    
    features = [col for col in data.columns if col != "SalePrice"]
    
    train = data[features]
    y = data.SalePrice
    # Log-transform the target
    y = np.log1p(y)
    
    # Encode string (object) columns as integers
    categoricals = [x for x in train.columns if train[x].dtype == 'object']
    for col in categoricals:
        lbl = LabelEncoder()
        lbl.fit(train[col].values)
        train[col] = lbl.transform(train[col].values)
    
    # Replace NaN values with the column mean
    train = train.fillna(train.mean())
    
    print train.shape

    return train, y

In [110]:
train, y = dt_prepare_data_plus_log(data.copy())


/Users/babou/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
(1460, 80)

In [112]:
score = cross_validation(dt, train, y)
print score.mean()


0.147558095984

In [113]:
from sklearn.ensemble import RandomForestRegressor

In [114]:
rfr = RandomForestRegressor(random_state=0)

In [115]:
score = cross_validation(rfr, train, y)
print score.mean()


0.104381544338

In [149]:
X_train, X_validation, y_train, y_validation = train_test_split(train, y, random_state = 3)

In [150]:
rfr.fit(X_train, y_train)


Out[150]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [151]:
mes_predictions_exp = np.expm1(rfr.predict(X_validation))

In [152]:
mes_predictions_exp[0:5]


Out[152]:
array([ 103319.91975369,  135961.29972815,  202308.67619832,
        132864.2976212 ,  124147.72343506])

In [36]:
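# note: np.expm1 is the exact inverse of np.log1p; np.exp leaves every value off by one (hence the 115001.0 below)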
y_validation_exp = np.exp(y_validation)

In [122]:
y_validation_exp[0:5]


Out[122]:
140     115001.0
950     129001.0
248     180001.0
1360    189001.0
568     316601.0
Name: SalePrice, dtype: float64

In [123]:
mean_absolute_error(y_validation_exp, mes_predictions_exp)


Out[123]:
19435.848209734722

In [124]:
plt.scatter(mes_predictions_exp, y_validation_exp)
plt.plot([min(mes_predictions_exp), max(mes_predictions_exp)],
         [min(mes_predictions_exp), max(mes_predictions_exp)], c="red")
plt.xlabel('My predictions')
plt.ylabel('True values')


Out[124]:
<matplotlib.text.Text at 0x112a3e550>

In [147]:
def dt_prepare_data_plus_log_1(data):
    
    # Drop the rows that are too extreme (outliers)
    data = data.drop(data.index[[523, 898, 1298]])
    data['1stFlr_2ndFlr_Sf'] = np.log1p(data['1stFlrSF'] + data['2ndFlrSF'])
    
    data.loc[data.MasVnrType == 'None', 'MasVnrArea'] = 0
    
    data.loc[pd.isnull(data.BsmtFinType2), 'BsmtFinSF2'] = 0
    
    features = [col for col in data.columns if col != "SalePrice"]
    
    train = data[features]
    y = data.SalePrice
    # Log-transform the target
    y = np.log1p(y)
    
    # Encode string (object) columns as integers
    categoricals = [x for x in train.columns if train[x].dtype == 'object']
    for col in categoricals:
        lbl = LabelEncoder()
        lbl.fit(train[col].values)
        train[col] = lbl.transform(train[col].values)
    
    # Replace NaN values with the column mean
    train = train.fillna(train.mean())
    
    print train.shape

    return train, y

In [148]:
train, y = dt_prepare_data_plus_log_1(data.copy())


/Users/babou/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:23: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
(1457, 81)

In [127]:
score = cross_validation(rfr, train, y)
print score.mean()


0.103007546782

In [160]:
pd.DataFrame?

In [165]:
coef = pd.DataFrame({'col' : X_train.columns,'importance' : rfr.feature_importances_})
coef = coef.sort_values('importance', ascending=False)
top_tree_features = coef.col.head(25)
#plt.figure(figsize=(10, 5))
#coef.head(25).plot(kind='bar')
#plt.title('Feature Significance')

In [167]:
top_tree_features


Out[167]:
17         OverallQual
80    1stFlr_2ndFlr_Sf
38         TotalBsmtSF
43            1stFlrSF
19           YearBuilt
61          GarageCars
46           GrLivArea
4              LotArea
62          GarageArea
34          BsmtFinSF1
18         OverallCond
2             MSZoning
3          LotFrontage
41          CentralAir
24         Exterior2nd
12        Neighborhood
30            BsmtQual
20        YearRemodAdd
37           BsmtUnfSF
67         OpenPorchSF
59         GarageYrBlt
66          WoodDeckSF
44            2ndFlrSF
1           MSSubClass
0                   Id
Name: col, dtype: object

In [173]:
coef_ridge = pd.DataFrame({'col' : X_train.columns,
                           'importance' : model_ridge.coef_})
coef_ridge[coef_ridge.col.isin(list(top_tree_features))].shape

    
#imp_coef = coef.sort_values().head(nb_important)
#imp_coef.plot(kind = "barh", figsize=(10, 8))
#plt.title("Coefficients in Model")


Out[173]:
(20, 2)

In [175]:
coef_ridge.tail()


Out[175]:
col importance
271 SaleCondition_AdjLand 0.011574
272 SaleCondition_Alloca -0.004819
273 SaleCondition_Family -0.008842
274 SaleCondition_Normal 0.031992
275 SaleCondition_Partial 0.015414

In [183]:
rfr = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=-1)
rfr


Out[183]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=-1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [184]:
score = cross_validation(rfr, train, y)
print score.mean()


0.0948377547648

In [185]:
RandomForestRegressor?

In [204]:
cv_rfr = []
n_estimators = [10, 50, 100, 200]
max_depths = [3, 5, 7]


for n_estimator in n_estimators:
    for max_depth in max_depths:
        print "Je lance n_estimator : " + str(n_estimator) + " et "+str(max_depth) + " max_depth."
        score = cross_validation(RandomForestRegressor(n_estimators=n_estimator,
                                                       max_depth=max_depth,
                                                       random_state=0), train, y).mean() 
        cv_rfr.append({'n_estimator' : n_estimator,
                       'max_depths' : max_depth,
                        'score' : score})


Running n_estimators : 10 and max_depth : 3.
Running n_estimators : 10 and max_depth : 5.
Running n_estimators : 10 and max_depth : 7.
Running n_estimators : 50 and max_depth : 3.
Running n_estimators : 50 and max_depth : 5.
Running n_estimators : 50 and max_depth : 7.
Running n_estimators : 100 and max_depth : 3.
Running n_estimators : 100 and max_depth : 5.
Running n_estimators : 100 and max_depth : 7.
Running n_estimators : 200 and max_depth : 3.
Running n_estimators : 200 and max_depth : 5.
Running n_estimators : 200 and max_depth : 7.

In [205]:
cv_rfr_df = pd.DataFrame(cv_rfr)

In [206]:
cv_rfr_df


Out[206]:
max_depths n_estimator score
0 3 10 0.144042
1 5 10 0.115688
2 7 10 0.105318
3 3 50 0.142745
4 5 50 0.112303
5 7 50 0.100993
6 3 100 0.142165
7 5 100 0.111448
8 7 100 0.100080
9 3 200 0.142516
10 5 200 0.111284
11 7 200 0.099543

In [212]:
from sklearn.model_selection import GridSearchCV
param_grid = {"n_estimators": [250, 300],
              "max_depth": [3, 5, 7, 9]}
#grid_search = GridSearchCV(rfr, param_grid, n_jobs=-1, cv=5)

In [213]:
grid_search = GridSearchCV(rfr,
                           param_grid,
                           n_jobs=-1,
                           cv=5,
                           scoring='neg_mean_absolute_error')

In [214]:
grid_search.fit(train, y)
#print grid_search.best_params_


Out[214]:
GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=-1, oob_score=False, random_state=0,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [250, 300], 'max_depth': [3, 5, 7, 9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_absolute_error', verbose=0)

In [215]:
grid_search.grid_scores_


/Users/babou/anaconda/lib/python2.7/site-packages/sklearn/model_selection/_search.py:667: DeprecationWarning: The grid_scores_ attribute was deprecated in version 0.18 in favor of the more elaborate cv_results_ attribute. The grid_scores_ attribute will not be available from 0.20
  DeprecationWarning)
Out[215]:
[mean: -0.14276, std: 0.00559, params: {'n_estimators': 250, 'max_depth': 3},
 mean: -0.14265, std: 0.00533, params: {'n_estimators': 300, 'max_depth': 3},
 mean: -0.11145, std: 0.00313, params: {'n_estimators': 250, 'max_depth': 5},
 mean: -0.11137, std: 0.00311, params: {'n_estimators': 300, 'max_depth': 5},
 mean: -0.09962, std: 0.00326, params: {'n_estimators': 250, 'max_depth': 7},
 mean: -0.09963, std: 0.00317, params: {'n_estimators': 300, 'max_depth': 7},
 mean: -0.09596, std: 0.00311, params: {'n_estimators': 250, 'max_depth': 9},
 mean: -0.09587, std: 0.00305, params: {'n_estimators': 300, 'max_depth': 9}]

In [216]:
print grid_search.best_params_


{'n_estimators': 300, 'max_depth': 9}

In [217]:
rfr = RandomForestRegressor(n_estimators=300, max_depth=9
                            , random_state=0)

In [219]:
X_train, X_validation, y_train, y_validation = train_test_split(train, y, random_state = 3)

In [221]:
rfr.fit(X_train, y_train)


Out[221]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=9,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=300, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [222]:
mes_predictions_exp = np.expm1(rfr.predict(X_validation))

In [223]:
mes_predictions_exp[0:5]


Out[223]:
array([ 109968.10111552,  125796.22152704,  197819.32753382,
        132246.90615991,  132478.10448292])

In [224]:
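# note: as before, np.expm1 would be the exact inverse of np.log1p; np.exp leaves the values off by one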
y_validation_exp = np.exp(y_validation)
y_validation_exp[0:5]


Out[224]:
140    115001.0
951    119901.0
248    180001.0
738    179001.0
569    135961.0
Name: SalePrice, dtype: float64

In [225]:
mean_absolute_error(y_validation_exp, mes_predictions_exp)


Out[225]:
18059.93165581434

In [226]:
plt.scatter(mes_predictions_exp, y_validation_exp)
plt.plot([min(mes_predictions_exp), max(mes_predictions_exp)],
         [min(mes_predictions_exp), max(mes_predictions_exp)], c="red")
plt.xlabel('My predictions')
plt.ylabel('True values')


Out[226]:
<matplotlib.text.Text at 0x11a19c9d0>

Yeah, let's mix the two algorithms!!


In [ ]:
model_ridge = Ridge(alpha=13.5, random_state=42)

In [233]:
mes_predictions_ridge = np.expm1(model_ridge.predict(X_validation_ridge))

In [234]:
mes_predictions_ridge[0:5]


Out[234]:
array([  96823.69822388,  113956.87312754,  192531.99426733,
        131920.8609816 ,  142354.91206622])

In [235]:
mes_predictions_exp[0:5]


Out[235]:
array([ 109968.10111552,  125796.22152704,  197819.32753382,
        132246.90615991,  132478.10448292])

In [237]:
resultat = pd.DataFrame({'ridge' : mes_predictions_ridge,
                       'tree' : mes_predictions_exp,
                       'realite' : y_validation_exp})

In [239]:
resultat['moyenne'] = (resultat.ridge + resultat.tree) / 2.0
resultat.head()
resultat.head()


Out[239]:
realite ridge tree moyenne
140 115001.0 96823.698224 109968.101116 103395.899670
951 119901.0 113956.873128 125796.221527 119876.547327
248 180001.0 192531.994267 197819.327534 195175.660901
738 179001.0 131920.860982 132246.906160 132083.883571
569 135961.0 142354.912066 132478.104483 137416.508275

In [240]:
mean_absolute_error(resultat.realite, resultat.ridge)


Out[240]:
13535.317184674073

In [241]:
mean_absolute_error(resultat.realite, resultat.tree)


Out[241]:
18059.93165581434

In [242]:
mean_absolute_error(resultat.realite, resultat.moyenne)


Out[242]:
13998.791706156873
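
The simple average does not beat ridge alone here. A hypothetical next step (not in the original notebook): search for the blend weight that minimizes the validation MAE instead of fixing it at 0.5:

weights = np.linspace(0, 1, 21)
blend_maes = [mean_absolute_error(resultat.realite,
                                  w * resultat.ridge + (1 - w) * resultat.tree)
              for w in weights]
print(weights[np.argmin(blend_maes)])   # weight given to the ridge predictions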

Feature importance in a tree:


In [59]:
coef = pd.Series(rfr.feature_importances_, index = X_train.columns).sort_values(ascending=False)

plt.figure(figsize=(10, 5))
coef.head(25).plot(kind='bar')
plt.title('Feature Significance')


Out[59]:
<matplotlib.text.Text at 0x113a20f10>
